# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# File Name          : run_paper_data_analysis_figures.r
# Programmer Name    : Luis F. Campos
#                     luiscampos@g.harvard.edu
#
# Purpose            : This file contains code to visualize results presented 
#					   in Section 7 of "Worth Weighting? How to Think 
#					   About and Use Sample Weights in Survey Experiments"
#
# Input              : The .Rda files creates in run_paper_data_analysis.R
# Output             : pdf files and tables
# 
# References         : None
#
#
# Platform           : R
# Version            : v3.3.0
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Replication Information:
#   We do expect differences when running this code on different systems 
#   (and with different seeds), but the differences all tend to be quite 
#   small and the general trends remain. If you would like to replicate 
#   the exact numbers and figures used in the article, in the readme you 
#   will find the session info. Please install and load all package 
#   versions (including the R version) to ensure the exact replication. 
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #



set.seed(3)

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# 	Please run both 
#		1. "run_paper_data_analysis.R" 
#		2. "run_paper_data_analysis_noPS.R"
#   before running this.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #



# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Sections:
# 	1. Some basic summaries: shows summaries of the survey experiments 
# 	   referenced in Section 7, first pragraphs
# 	2. Figures and significance analysis: makes figures presented in 
# 	   paper, Figures 2 and 3 as well as presents analysis for delta
# 	3. Appendix data analysis: rerun analysis in 2 after subsetting only 
#      to primary outcome
#	4. Post-Stratification Analysis: We analyze the experiments with 
#      and without post-stratifying on party to compare their variances. 
#      This is done for both the main set of experiments (as in 2) and 
#      the subset of experiments (as in 3)
# 	5. Make Experiments Table for Appendix: Creates template for Table 1 
#      in Appendix
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #





# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# 					    1. Some basic summaries
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
out_dir = './'
# read in output created in "run_paper_data_analysis.R"
res = read.csv(file = paste(out_dir, 'final_results.csv', sep = ''))[,-1]
res1 = res
tb1 = cbind(c('tau.sate.sig', 'tau.hh.sig', 'tau.ps.hh.sig'), rbind(table(res1$tau.sate.sig), table(res1$tau.hh.sig), table(res1$tau.ps.hh.sig)))
colnames(tb1) = c('Est', 'Not.Sig', 'Sig')

# Number of significant and non-significant experiments [*Note: Results Used in Section 6, page 21*]
tb1
#      Est             Not.Sig Sig 
# [1,] "tau.sate.sig"  "27"    "65"
# [2,] "tau.hh.sig"    "41"    "51"
# [3,] "tau.ps.hh.sig" "41"    "51"

# Range of sample size [*Note: Results Used in Section 6, page 21*]
range(as.numeric(as.character(res1$n)))
# [1] 145 504

# grab survey names
surveys = levels(res1$survey) 
# relabel for plotting
levels(res1$survey) = 1:length(unique(res1$survey)) 

# Table of Experiments [*Note: Results Used in Section 6, page 20*]
tab = data.frame(Experiments = table(res1$survey), N = c(1000, 2000, 1000, 1000, 1000, 1000, 1200))
tab[,1] = surveys
tab

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
#   Experiments.Var1 Experiments.Freq    N
# 1       CCES10_EIT                4 1000 X
# 2       CCES10_UCB                2 2000 X
# 3           CCES12               36 1000 X
# 4         CCES2014               14 1000 X
# 5       UCB_Follow               10 1000 X
# 6         Virginia               16 1000 X
# 7           YouGov               10 1200 X
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

# Fraction Significant [*Note: Results Used in Section 6, page 21*]
as.numeric(tb1[,3])/(dim(res1)[1])
# [1] 0.7065217 0.5543478 0.5543478


# Number of unique Outcome and Randomization Pairs [*Note: Results Used in Section 6, page 20*]
length(unique(paste(res1$outcome, res1$treatment)))
# [1] 41
# plus 5 since transgress appears twice

length(unique(paste(res1$treatment))) # [*Note: Results Used in Appendix, page 17*]
# 17
# plus one since transgress appears twice

# Increase of Variance for hh, relative to sate [*Note: Results Used in Section 6, page 21 -- average increase in SE when attempting to estimate PATE*]
summary((res1[,"tau.hh.SE"]/res1[,"tau.sate.SE"]))
 #   Min. 1st Qu.  Median  *Mean* 3rd Qu.    Max. 
 # 0.7597  1.0760  1.2940  1.3060  1.4520  2.2060 
summary((res1[,"tau.ps.hh.SE"]/res1[,"tau.hh.SE"]))  # [*Note: Results Used in Section 6, page 22 -- average increase in SE when post-stratifying on weight*]
 #   Min. 1st Qu.  Median  *Mean* 3rd Qu.    Max. 
 # 0.8919  0.9979  1.0070  1.0040  1.0140  1.0430 
# Delta Mean [*Note: Results Used in Section 6, page 21 - average delta*]
mean((res1[,"tau.sate.est"]-res1[,"tau.hh.est"])/res1[,"tau.star1.SE"])
# [1] -0.1159847






# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# 				2. Figures and significance analysis
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

pch = c(15, 17, 19, 1, 2, 5, 6)

pdf(file = paste('figure2.pdf', sep = ''), width = 5, height = 6)
	par(mar=c(5.1,4.1,2.1,0.5))
	hist(res1[,"tau.hh.SE"]/res1[,"tau.sate.SE"], breaks = 25, xlab = expression(SE(hat(tau)[hh])/SE(hat(tau)[SATE])), main = 'Relative Efficiency', col = 'grey', cex.lab = 1.3, cex.axis = 1.3, xaxt = 'n')
	axis(1, at = seq(0.8, 2.2, 0.2), lab = seq(0.8, 2.2, 0.2))
	abline(v = 1, col = '#778899', lwd = 2)
dev.off()

pdf(file = paste('figure3.pdf', sep = ''), width = 5, height = 6)
	par(mar=c(5.1,4.1,2.1,0.5))
	qqnorm((res1[,"tau.sate.est"]-res1[,"tau.hh.est"])/res1[,"tau.star1.SE"], ylab = "Standard Normal", xlab = expression(hat(delta)==(hat(tau)[SATE]-hat(tau)[hh])/widehat(SE)), main = 'qq-Plot: Relative Difference', pch = pch[as.numeric(as.character(res1$survey))], cex.lab = 1.3, cex.axis = 1.3)
		abline(a = 0, b = 1, col = '#778899')
	legend('topleft', surveys, pch =  pch)
dev.off()


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Analyze delta multiple for multiple comparisons
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
 # [*Note: Results Used in Section 6, page 22*]
delta = (res1[,"tau.sate.est"]-res1[,"tau.hh.est"])/res1[,"tau.star1.SE"]
pvals = apply(cbind(pnorm(delta), pnorm(-delta)), 1, min)
length(pvals)
# [1] 92
sum(pvals < 0.025)
# [1] 4
sum(pvals < 0.05)
# [1] 7
library('astsa')
FDR(pvals)
# NULL







# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# 					3. Appendix Data Analysis
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #



# Keep only experiments with primary outcomes (should be 36)

res$keep.indicator = 0
res$keep.indicator[c(1:4, 17:18, 35:36, 43:44, 51:54, 63:84)] = 1
res_sub = res[c(1:4, 17:18, 35:36, 43:44, 51:54, 63:84), ]

# grab survey names
surveys = levels(res_sub$survey) 
# relabel for plotting
levels(res_sub$survey) = 1:length(unique(res_sub$survey)) 

pch = c(15, 17, 19, 1, 2, 5, 6)

write.csv(res_sub, file = paste(out_dir, 'final_results_appendix.csv', sep = ''))



pdf(file = paste('figure2_appendix.pdf', sep = ''), width = 10, height = 6)
par(mfrow = c(1, 2))
par(mar=c(5.1,4.1,2.1,0.5))
hist(res_sub[,"tau.hh.SE"]/res_sub[,"tau.sate.SE"], breaks = 20, xlab = expression(SE(hat(tau)[hh])/SE(hat(tau)[SATE])), main = '(a)', col = 'grey', cex.lab = 1.3, cex.axis = 1.3)
abline(v = 1, col = '#778899')

qqnorm(res_sub[,"delta"], ylab = "Standard Normal", xlab = expression(hat(delta)==(hat(tau)[SATE]-hat(tau)[hh])/widehat(SE)), main = '(b)', pch = pch[as.numeric(as.character(res_sub$survey))], cex.lab = 1.3, cex.axis = 1.3)
abline(a = 0, b = 1, col = '#778899')
legend('topleft', surveys, pch =  pch)


dev.off()

# number significant treatment effects [*Note: Results Used in Appendix E*]
c(sum(res_sub$tau.sate.sig), sum(res_sub$tau.hh.sig), sum(res_sub$tau.ps.hh.sig))
# [1] 28 25 26

c(sum(res_sub$tau.sate.sig), sum(res_sub$tau.hh.sig), sum(res_sub$tau.ps.hh.sig))/dim(res_sub)[1]
# [1] 0.7777778 0.6944444 0.7222222

mean(res_sub$tau.hh.SE/res_sub$tau.sate.SE) #[*Note: Results Used in Appendix E, page 17 -- average increase in SE when attempting to estimate PATE*]
# [1] 1.31628

mean(res_sub$tau.ps.hh.SE/res_sub$tau.hh.SE)  #[*Note: Results Used in Appendix E -- average increase in SE when post-stratifying on weight*]
# [1] 1.001998

delta = res_sub[,"delta"]
pvals = apply(cbind(pnorm(delta), pnorm(-delta)), 1, min)


length(pvals)
# [1] 36
sum(pvals < 0.025)
# [1] 1
sum(pvals < 0.05)
# [1] 2

library('astsa') #[*Note: Results Used in Appendix E*]
FDR(pvals)
# NULL





# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# 					4. Post-Stratification Analysis
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

res$n = as.numeric(as.character(res$n))
res_forPS = res

nexp = dim(res)[1]

# create submatrix fill with NA
res_PS = res_forPS[1:(nexp/2), ]
# res_PS[,] = NA

for(i in 1:(nexp/2)){
	row = cbind(seq(1,nexp, 2), seq(2,nexp, 2))[i,]
	res_rows = res_forPS[row,]

	tau1 = res_rows[1, c(11, 13, 15, 17)]
	tau2 = res_rows[2, c(11, 13, 15, 17)]
	SE1 = res_rows[1, c(12, 14, 16, 18)]
	SE2 = res_rows[2, c(12, 14, 16, 18)]

	n1 = res_rows$n[1]
	n2 = res_rows$n[2]

	tau = (n1/(n1+n2))* tau1 + (n2/(n1+n2))* tau2
	SE = sqrt((n1/(n1+n2))^2* SE1^2 + (n2/(n1+n2))^2* SE2^2)

	res_PS[i,] = res_rows[1,]
	res_PS[i,c(11, 13, 15, 17)] = tau
	res_PS[i,c(12, 14, 16, 18)] = SE
	res_PS[i,1] = n1 + n2
}

res_PS$tau.sate.L = res_PS[,"tau.sate.est"] - 1.96*res_PS[,"tau.sate.SE"]
res_PS$tau.sate.U = res_PS[,"tau.sate.est"] + 1.96*res_PS[,"tau.sate.SE"]
res_PS$tau.hh.L = res_PS[,"tau.hh.est"] - 1.96*res_PS[,"tau.hh.SE"]
res_PS$tau.hh.U = res_PS[,"tau.hh.est"] + 1.96*res_PS[,"tau.hh.SE"]
res_PS$tau.ps.hh.L = res_PS[,"tau.ps.hh.est"] - 1.96*res_PS[,"tau.ps.hh.SE"]
res_PS$tau.ps.hh.U = res_PS[,"tau.ps.hh.est"] + 1.96*res_PS[,"tau.ps.hh.SE"]

res_PS$tau.sate.sig = ((res_PS$tau.sate.L > 0) | (res_PS$tau.sate.U < 0))
res_PS$tau.hh.sig = ((res_PS$tau.hh.L > 0) | (res_PS$tau.hh.U < 0))
res_PS$tau.ps.hh.sig = ((res_PS$tau.ps.hh.L > 0) | (res_PS$tau.ps.hh.U < 0))

res_PS$delta = (res_PS[,"tau.sate.est"]-res_PS[,"tau.hh.est"])/res_PS[,"tau.star1.SE"]

apply(res_PS[,26:28], 2, sum)
 # tau.sate.sig    tau.hh.sig tau.ps.hh.sig 
 #           28            23            22 


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Read in Survey Weight analysis done without PS 
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# read in output created in "run_paper_data_analysis_noPS.R"
res_noPS = read.csv(paste('final_results_no_PS.csv', sep = ''))[,-1]


improvement_hh = res_noPS[,'tau.hh.SE']/res_PS[,'tau.hh.SE']
improvement_ps_hh = res_noPS[,'tau.hh.SE']/res_PS[,'tau.ps.hh.SE']



results = rbind( c(length(improvement_hh), sum(improvement_hh>1), mean(improvement_hh>1), mean(improvement_hh), median(improvement_hh)),
	c(length(improvement_ps_hh), sum(improvement_ps_hh>1), mean(improvement_ps_hh>1), mean(improvement_ps_hh), median(improvement_ps_hh)))
colnames(results) = c('Experiments', 'num.improved', 'frac.improved', 'mean.improvement', 'median.improvement')
rownames(results) = c('tau.hh', 'tau.ps.hh')

results #[*Note: Results Used in Section 6, page 22 -- SE decrease from PS on partyid*]
#           Experiments num.improved frac.improved mean.improvement
# tau.hh             46           28     0.6086957         1.016914
# tau.ps.hh          46           27     0.5869565         1.013192
#           median.improvement
# tau.hh              1.011051
# tau.ps.hh           1.013515






# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# post-stratification analysis subset experiments
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
i = which(res_PS$keep.indicator == 1)

improvement_hh = res_noPS[i,'tau.hh.SE']/res_PS[i,'tau.hh.SE']
improvement_ps_hh = res_noPS[i,'tau.hh.SE']/res_PS[i,'tau.ps.hh.SE']


results = rbind( c(length(improvement_hh), sum(improvement_hh>1), mean(improvement_hh>1), mean(improvement_hh)),
	c(length(improvement_ps_hh), sum(improvement_ps_hh>1), mean(improvement_ps_hh>1), mean(improvement_ps_hh)))
colnames(results) = c('Experiments', 'num.improved', 'frac.improved', 'mean.improvement')
rownames(results) = c('tau.hh', 'tau.ps.hh')
results #[*Note: Results Used in Appendix E, page 17 -- SE decrease from PS on partyid*]
#           Experiments num.improved frac.improved mean.improvement
# tau.hh             18           12     0.6666667         1.026175
# tau.ps.hh          18           12     0.6666667         1.024526










# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# 			5. Make Experiments Table for Appendix
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #

tmp = res_forPS[seq(1,92, 2),c(1:10, 19)][,c('survey', 'treatment', 'treatment_comp', 'outcome', 'n')]

tmp$nREP = res_forPS[seq(2,92, 2),c(1:10, 19)][,c('n')]

tmp$nDEM = tmp$n
tmp$n <- tmp$nREP + tmp$nDEM 

tmp = tmp[-c(9:16),]
write.csv(tmp, file = 'table1_appendix.csv')

